{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.2"
    },
    "colab": {
      "name": "Albert.ipynb",
      "provenance": [],
      "include_colab_link": true
    },
    "accelerator": "GPU",
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "5d36029ccf7d4b9c8e0a4e5d7392e944": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_067ec527c68745adb72775e5515763b1",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_21783d5c40514b208b60a3dbfbd468f1",
              "IPY_MODEL_0c66d3d94b2e4d8297c93cad9a722bfc"
            ]
          }
        },
        "067ec527c68745adb72775e5515763b1": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "21783d5c40514b208b60a3dbfbd468f1": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_1359af5c37184a60af691b2fd78a20df",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 238187,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 238187,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_d79a084d2a124832b0200c79448b04b4"
          }
        },
        "0c66d3d94b2e4d8297c93cad9a722bfc": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_a49616b738d14cff917e1729fde435c4",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 238k/238k [00:01&lt;00:00, 129kB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_d5ac57372b6c4f3e916c090e51c571e7"
          }
        },
        "1359af5c37184a60af691b2fd78a20df": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "d79a084d2a124832b0200c79448b04b4": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "a49616b738d14cff917e1729fde435c4": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "d5ac57372b6c4f3e916c090e51c571e7": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "7ecbd6466c624fb9b8ef305f96e91f9e": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_94cb3cf81069411287551f8966516876",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_990a7e655f9a47f482685f97eb049766",
              "IPY_MODEL_c9e40a709153400799dbb40546643619"
            ]
          }
        },
        "94cb3cf81069411287551f8966516876": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "990a7e655f9a47f482685f97eb049766": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_826e06ed24e24207aa1cc3476b62ceea",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 505,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 505,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_c1c8a6051b874c94ba48551cfb61852e"
          }
        },
        "c9e40a709153400799dbb40546643619": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_c885e37af03b4429a8544e6b4ee76552",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 505/505 [00:26&lt;00:00, 19.2B/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_22efb24d14d243b8b04299bcfae10650"
          }
        },
        "826e06ed24e24207aa1cc3476b62ceea": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "c1c8a6051b874c94ba48551cfb61852e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "c885e37af03b4429a8544e6b4ee76552": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "22efb24d14d243b8b04299bcfae10650": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "60ad2075fad946eb90daefc361845a5a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_c5b8c3ed934b4b8a81f1b1f2343df9cc",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_0e88793aa5a84c0cb42002605c5dab44",
              "IPY_MODEL_8f2546c2adca42e99539c4c67c06151a"
            ]
          }
        },
        "c5b8c3ed934b4b8a81f1b1f2343df9cc": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "0e88793aa5a84c0cb42002605c5dab44": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_09aa4ba8aae44478b029224736505dbb",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 897396780,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 897396780,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_bc87fb78642848b195fac99f62bf70be"
          }
        },
        "8f2546c2adca42e99539c4c67c06151a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_bfcadd3a90384cec8185659b3fa7ad56",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 897M/897M [00:25&lt;00:00, 35.0MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_937dc13a83e3419690064f0cc2ea0e3a"
          }
        },
        "09aa4ba8aae44478b029224736505dbb": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "bc87fb78642848b195fac99f62bf70be": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "bfcadd3a90384cec8185659b3fa7ad56": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "937dc13a83e3419690064f0cc2ea0e3a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/agemagician/ProtTrans/blob/master/Embedding/PyTorch/Advanced/ProtAlbert.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "12mrlxKlq0iH",
        "colab_type": "text"
      },
      "source": [
        "<h3> Extracting protein sequences' features using ProtAlbert pretrained-model <h3>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "07plbLYyq0iI",
        "colab_type": "text"
      },
      "source": [
        "<b>1. Load necessry libraries including huggingface transformers<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Z0nPTe1qq6Db",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 102
        },
        "outputId": "43b11e61-b03f-4bae-abd1-81447d570a98"
      },
      "source": [
        "!pip install -q transformers"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "\u001b[K     |████████████████████████████████| 778kB 7.3MB/s \n",
            "\u001b[K     |████████████████████████████████| 1.1MB 24.1MB/s \n",
            "\u001b[K     |████████████████████████████████| 890kB 42.7MB/s \n",
            "\u001b[K     |████████████████████████████████| 3.0MB 54.0MB/s \n",
            "\u001b[?25h  Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "PX7LjHhdq0iJ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import torch\n",
        "from transformers import AlbertModel, AlbertTokenizer\n",
        "import re\n",
        "import os\n",
        "import requests\n",
        "from tqdm.auto import tqdm"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2h9f55Xzq0iP",
        "colab_type": "text"
      },
      "source": [
        "<b>2. Load the vocabulary and ProtAlbert Model<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "erBUxiG3q0iQ",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 66,
          "referenced_widgets": [
            "5d36029ccf7d4b9c8e0a4e5d7392e944",
            "067ec527c68745adb72775e5515763b1",
            "21783d5c40514b208b60a3dbfbd468f1",
            "0c66d3d94b2e4d8297c93cad9a722bfc",
            "1359af5c37184a60af691b2fd78a20df",
            "d79a084d2a124832b0200c79448b04b4",
            "a49616b738d14cff917e1729fde435c4",
            "d5ac57372b6c4f3e916c090e51c571e7"
          ]
        },
        "outputId": "829e7483-0c18-4206-a5c4-20cca6eff513"
      },
      "source": [
        "tokenizer = AlbertTokenizer.from_pretrained(\"Rostlab/prot_albert\", do_lower_case=False)"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "5d36029ccf7d4b9c8e0a4e5d7392e944",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=238187.0, style=ProgressStyle(descripti…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6U-fK1Usq0iS",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 115,
          "referenced_widgets": [
            "7ecbd6466c624fb9b8ef305f96e91f9e",
            "94cb3cf81069411287551f8966516876",
            "990a7e655f9a47f482685f97eb049766",
            "c9e40a709153400799dbb40546643619",
            "826e06ed24e24207aa1cc3476b62ceea",
            "c1c8a6051b874c94ba48551cfb61852e",
            "c885e37af03b4429a8544e6b4ee76552",
            "22efb24d14d243b8b04299bcfae10650",
            "60ad2075fad946eb90daefc361845a5a",
            "c5b8c3ed934b4b8a81f1b1f2343df9cc",
            "0e88793aa5a84c0cb42002605c5dab44",
            "8f2546c2adca42e99539c4c67c06151a",
            "09aa4ba8aae44478b029224736505dbb",
            "bc87fb78642848b195fac99f62bf70be",
            "bfcadd3a90384cec8185659b3fa7ad56",
            "937dc13a83e3419690064f0cc2ea0e3a"
          ]
        },
        "outputId": "7b4dde58-146f-44a1-9026-52c7069db8ea"
      },
      "source": [
        "model = AlbertModel.from_pretrained(\"Rostlab/prot_albert\")"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "7ecbd6466c624fb9b8ef305f96e91f9e",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=505.0, style=ProgressStyle(description_…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "60ad2075fad946eb90daefc361845a5a",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=897396780.0, style=ProgressStyle(descri…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UfQF6_mKq0iU",
        "colab_type": "text"
      },
      "source": [
        "<b>3. Load the model into the GPU if avilabile and switch to inference mode<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "u69Is8iJq0iV",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')"
      ],
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wsoezHpkq0iX",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "model = model.to(device)\n",
        "model = model.eval()"
      ],
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1_jFQ34wq0ia",
        "colab_type": "text"
      },
      "source": [
        "<b>4. Create or load sequences and map rarely occured amino acids (U,Z,O,B) to (X)<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "hNAgRu4Fq0ib",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "sequences_Example = [\"A E T C Z A O\",\"S K T Z P\"]"
      ],
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VhBH8zh3q0id",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "sequences_Example = [re.sub(r\"[UZOB]\", \"X\", sequence) for sequence in sequences_Example]"
      ],
      "execution_count": 8,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xOUp3Vg_q0if",
        "colab_type": "text"
      },
      "source": [
        "<b>5. Tokenize, encode sequences and load it into the GPU if possibile<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Md1eVbXdq0ig",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "ids = tokenizer.batch_encode_plus(sequences_Example, add_special_tokens=True, pad_to_max_length=True)"
      ],
      "execution_count": 9,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_Y8xvJpRq0ii",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "input_ids = torch.tensor(ids['input_ids']).to(device)\n",
        "attention_mask = torch.tensor(ids['attention_mask']).to(device)"
      ],
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9uPjdDpFq0ik",
        "colab_type": "text"
      },
      "source": [
        "<b>6. Extracting sequences' features and load it into the CPU if needed<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BlGklmP3q0il",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "with torch.no_grad():\n",
        "    embedding = model(input_ids=input_ids,attention_mask=attention_mask)[0]"
      ],
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Fm9uGvnUq0in",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "embedding = embedding.cpu().numpy()"
      ],
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2CgHvhwIq0ip",
        "colab_type": "text"
      },
      "source": [
        "<b>7. Remove padding ([PAD]) and special tokens ([CLS],[SEP]) that is added by Albert model<b>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "IXuCzsfNq0ip",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "features = [] \n",
        "for seq_num in range(len(embedding)):\n",
        "    seq_len = (attention_mask[seq_num] == 1).sum()\n",
        "    seq_emd = embedding[seq_num][1:seq_len-1]\n",
        "    features.append(seq_emd)"
      ],
      "execution_count": 13,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5148XKrFq0is",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 391
        },
        "outputId": "ae1265de-c36e-49aa-dbbe-91249fe6bc06"
      },
      "source": [
        "print(features)"
      ],
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[array([[ 0.05041128,  0.01670685, -0.06041763, ...,  0.03879201,\n",
            "         0.0674293 , -0.02243809],\n",
            "       [ 0.04481388,  0.02007382, -0.06041713, ...,  0.01449793,\n",
            "         0.07357242, -0.02993674],\n",
            "       [ 0.04433505,  0.02113645, -0.0598816 , ...,  0.01485567,\n",
            "         0.07245695, -0.02932317],\n",
            "       ...,\n",
            "       [ 0.04479406,  0.01773301, -0.06211168, ...,  0.01833694,\n",
            "         0.0708717 , -0.02901287],\n",
            "       [ 0.03966058,  0.04050258, -0.01892234, ...,  0.02171973,\n",
            "         0.06361844, -0.04444959],\n",
            "       [ 0.04467377,  0.02889517, -0.06304097, ...,  0.00830418,\n",
            "         0.0829025 , -0.02485315]], dtype=float32), array([[-0.03166402,  0.01354338, -0.02900368, ..., -0.11964945,\n",
            "        -0.11535365,  0.07926654],\n",
            "       [-0.04238719,  0.01891202, -0.02534003, ..., -0.12101925,\n",
            "        -0.10938749,  0.09651614],\n",
            "       [-0.04247086,  0.02128087, -0.02400087, ..., -0.12410779,\n",
            "        -0.10866657,  0.10307116],\n",
            "       [-0.04353952,  0.02157599, -0.02257544, ..., -0.12234627,\n",
            "        -0.10860701,  0.09839469],\n",
            "       [-0.04395707,  0.02232352, -0.02410617, ..., -0.12444247,\n",
            "        -0.10650334,  0.11278596]], dtype=float32)]\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}